library(dplyr)
library(plotly)
library(flexdashboard)
#devtools::install_github("GIST-ORNL/wbstats")
library(wbstats)
#install.packages("countrycode")
library(countrycode)
library(rvest)
library(RColorBrewer)
library(lubridate)
library(broom)
library(DT)
#install.packages("listviewer")
library(listviewer)
library(tidyr)
#devtools::install_github("EvanOdell/hansard")
library(hansard)
library(xts)
library(zoo)
library(readr)
library(fuzzyjoin)
library(stringi)
library(IRdisplay)
# Download and parse OECD table
oecd_members <- read_html('http://www.oecd.org/about/membersandpartners/list-oecd-member-countries.htm') %>%
html_nodes('table') %>%
html_table(fill = TRUE, header = TRUE)
# Convert to countrycodes
oecd_codes <- countrycode(oecd_members[[2]]$Country, origin = "country.name", destination = "iso2c")
# Remove non-countries
oecd_codes <- oecd_codes[!is.na(oecd_codes)]
# Get data for women in parliament across the world
women_parliament <- wb(indicator = "SG.GEN.PARL.ZS", country=oecd_codes, startdate = 1997, enddate = 2017, POSIXct = TRUE) %>%
transmute(date=date_ct, women_parliament=value, iso2c, country)
cbind(country=oecd_members[[2]]$Country, oecd_codes)
# Divide countries into bins by 2015 GDP/capita
country_bins = data.frame(label=c("Low Income", "Medium Income", "High Income"),
color=brewer.pal(3, "Set2"))
gdp_per_capita <- wb(indicator = "NY.GDP.PCAP.CD", country=oecd_codes, startdate = 2015, enddate = 2015) %>%
transmute(country=country,
gdp_per_capita=value,
binlabel=cut(log10(value),
breaks = 3, labels=country_bins$label),
bincolor=cut(log10(value),
breaks = 3, labels=country_bins$color))
# Get country population
#population <- wb(indicator = "SP.POP.TOTL", country=oecd_codes, startdate=1997, enddate=2017, POSIXct = TRUE) %>%
# transmute(country, date=date_ct, population=value)
# Alternative population source which is more up to date: http://data.worldbank.org/data-catalog/population-projection-tables
population <- read.csv("worldbank-population.csv") %>%
select(-Series.Name, -Series.Code) %>%
gather(date, population, -Country.Name, -Country.Code) %>%
mutate(date=as.Date(paste0(substr(date, 2, 5) , '-01-01'))) %>%
filter(Country.Code %in% countrycode(oecd_codes, origin = "iso2c", destination = "iso3c")) %>%
transmute(country=Country.Name,date,population)
p <-
gdp_per_capita %>% group_by(binlabel) %>% arrange(gdp_per_capita) %>%
plot_ly(x=~country, y=~gdp_per_capita,
xaxis=list(categorymode="array"), color=~binlabel,
type="bar") %>%
layout(xaxis=list(title="Country", tickangle=-45, showgrid=F),
yaxis=list(title="GDP per Capita", showgrid=F),
margin=list(b=100),
paper_bgcolor="transparent",
plot_bgcolor="transparent",
font=list(color="lightgrey"),
legend = list(x = 0.05, y = 1, bgcolor = "#333333"))
p
df <- women_parliament %>%
left_join(gdp_per_capita, by=c("country"="country")) %>%
left_join(population, by=c("country"="country", "date")) %>%
filter(!is.na(population)) %>%
group_by(country)
p <- df %>% filter(iso2c != "GB") %>%
plot_ly(x=~date, y=~women_parliament,
type = "scatter", mode="lines",
opacity=0.15, line=list(width=3), color=~binlabel,
text=~paste0("<b>", country, "</b>","<br>", year(date),": ",
round(women_parliament, 0), "%"),
hoverinfo="text",
colors=brewer.pal(3, "Set2")
)
low_income <- df %>% filter(binlabel=="Low Income")
low_income$loess <- fitted(loess(women_parliament ~ as.numeric(date),
weights = population, data=low_income))
# Calculate LOESS lines: locally weighted best fits across each income group
low_income <- df %>% filter(binlabel=="Low Income")
low_income$loess <- fitted(loess(women_parliament ~ as.numeric(date),
weights = log10(population), data=low_income))
medium_income <- df %>% filter(binlabel=="Medium Income")
medium_income$loess <- fitted(loess(women_parliament ~ as.numeric(date),
weights = log10(population), data=medium_income))
high_income <- df %>% filter(binlabel=="High Income")
high_income$loess <- fitted(loess(women_parliament ~ as.numeric(date),
weights = log10(population), data=high_income))
# Plot 'em
p <- add_lines(p, opacity=1, data=low_income, x=~date, inherit = F, line=list(width=4, dash="dash", color=brewer.pal(3, "Set2")[1]),
y=~loess, name="Low Income average",
text=~paste0("<b>", "Low Income average", "</b>","<br>", year(date), ": ",
round(loess, 0), "%"), hoverinfo="text")
p <- add_lines(p, opacity=1, data=medium_income, x=~date, line=list(width=4, dash="dash"),
y=~loess, name="Medium Income average", colors=brewer.pal(3, "Set2"),
text=~paste0("<b>", "Medium Income average", "</b>","<br>", year(date), ": ",
round(loess, 0), "%"), hoverinfo="text")
p <- add_lines(p, opacity=1, data=high_income, x=~date, line=list(width=4, dash="dash"),
y=~loess, name="High Income average", colors=brewer.pal(3, "Set2"),
text=~paste0("<b>", "High Income average", "</b>","<br>", year(date), ": ",
round(loess, 0), "%"), hoverinfo="text")
# Add UK line: would be nice to add it as part of the original group but can't specify opacity as an array
p <- df %>% filter(iso2c == "GB") %>%
add_trace(p, data=., x=~date, inherit = F, line=list(width=4, color="Red"),
y=~women_parliament, opacity=1.0, name="United Kingdom",
type="scatter", mode="lines",
text=~paste0("<b>", country, "</b>","<br>", year(date),": ",
round(women_parliament, 0), "%"), hoverinfo="text")
# Format the graph
p <- layout(p, hovermode="closest", xaxis=list(title="Year", showgrid=FALSE),
yaxis=list(title="% of women in parliament", showgrid=FALSE,
range=c(0,50)),
paper_bgcolor="transparent",
plot_bgcolor="transparent",
font=list(color="lightgrey"),
legend = list(yanchor="center"))
p
# Data from http://researchbriefings.files.parliament.uk/documents/SN01250/snsg-01250-Tables-for-download-Feb-2017-.xlsx
# Extracting from this excel file via read_excel() is buggy so I just copied and pasted it here.
women_historic <- "
Year Con Lab LD Other Total Pct_MPs
1918 0 0 0 1 1 0.10%
1922 1 0 1 0 2 0.30%
1923 3 3 2 0 8 1.30%
1924 3 1 0 0 4 0.70%
1929 3 9 1 1 14 2.30%
1931 13 0 1 1 15 2.40%
1935 6 1 1 1 9 1.50%
1945 1 21 1 1 24 3.80%
1950 6 14 0 1 21 3.40%
1951 6 11 0 0 17 2.70%
1955 10 14 0 0 24 3.80%
1959 12 13 0 0 25 4.00%
1964 11 18 0 0 29 4.60%
1966 7 19 0 0 26 4.10%
1970 15 10 0 1 26 4.10%
1974 9 13 0 1 23 3.60%
1974 7 18 0 2 27 4.30%
1979 8 11 0 0 19 3.00%
1983 13 10 0 0 23 3.50%
1987 17 21 2 1 41 6.30%
1992 20 37 2 1 60 9.20%
1997 13 101 3 3 120 18.20%
2001 14 95 5 4 118 17.90%
2005 17 98 10 3 128 19.80%
2010 49 81 7 6 143 22.00%
2015 68 99 0 24 191 29.40%
"
women_historic <- read.csv(text=women_historic, sep="\t") %>%
mutate(Year = as.Date(paste0(Year,"-01-01")), Pct_MPs = as.numeric(gsub("%$", "", Pct_MPs)))
# Get total number of commons MPs over time
# from https://en.wikipedia.org/wiki/Number_of_Westminster_MPs?oldformat=true#Number_of_MPs_by_country
number_of_constituencies <- "
Country 1918 1922 1945 1950 1955 1974 1983 1992 1997 2005 2010 2015
Total 707 615 640 625 630 635 650 651 659 646 650 650"
number_of_constituencies <- read.table(text=number_of_constituencies, header = T) %>%
gather(year, total) %>% # Turn wide table into long table
tail(n=-1) %>% # skip first row (old header)
mutate(year=as.Date(paste0(gsub("^X", "", year), "-01-01")), total=as.integer(total)) # Format year as a date
# Convert to time series to interpolate number of MPs between years
no_con <- xts(number_of_constituencies[,-1], order.by=number_of_constituencies[,1])
no_con <- seq(start(no_con), end(no_con), by="1 year") %>%
merge(no_con, .) %>%
na.approx(method="constant") # Use constant interpolation because MPs only change numbers at elections
# Convert back to dataframe
number_of_constituencies <- data.frame(year=index(no_con), total=no_con) %>% transmute(year, total_mps=no_con)
# Merge dataframes to get total number of MPs for each election year
historic_number_of_women <- left_join(women_historic, number_of_constituencies, by=c("Year"="year")) %>%
mutate(Pct_Women=round(Total/total_mps*100, digits=2)) # Check if Pct_MPs calculation is done correctly
head(historic_number_of_women)
# Historic election dates from http://researchbriefings.parliament.uk/ResearchBriefing/Summary/SN04512#fullreport
historic_election_dates <-
"1918-12-14
1922-11-15
1923-12-06
1924-10-29
1929-05-30
1931-10-27
1935-11-14"
historic_election_dates <- as.Date(strsplit(historic_election_dates, "\n")[[1]])
# Modern election dates from hansard
general_elections <- elections("all") %>% filter(electionType=="General Election")
# Concatenate modern election dates from hansard with historic ones and set as Year vector
historic_number_of_women$Year <- c(historic_election_dates, rev(as.Date(general_elections$date._value)))
p <-
historic_number_of_women %>%
select(-Total, -Pct_MPs, -total_mps, -Pct_Women) %>%
gather(party, number_of_women, -Year) %>%
plot_ly(x=~Year, y=~number_of_women, color=~party, mode="lines+markers", type="scatter", fill="tozeroy") %>%
# Format the graph
layout(hovermode="closest", xaxis=list(title="Year", showgrid=FALSE),
yaxis=list(title="% of women in parliament", showgrid=FALSE),
paper_bgcolor="transparent",
plot_bgcolor="transparent",
font=list(color="lightgrey"))
p
female_mps <- read_csv("women_parliament.csv",
col_types = cols(`Date first elected` = col_date(format = "%d/%m/%Y"),
X6 = col_character(), X7 = col_skip(),
X8 = col_skip()), skip = 1)
names(female_mps) <- c("name", "date.first.elected", "party", "constituency", "time.in.parliament", "notes")
## Clean up the dataframe
# Remove first line, which is blank
female_mps <- female_mps[-1,]
# If there is an NA in the party column, it is because the MP constituencies overflowed over two rows.
# Let's fix this by tacking on the overflowed constituency to the previous row
# Save the last row's constituency because this is accidentally overwritten by the next line
last_constituency <- tail(female_mps, n=1)$constituency
last_time <- tail(female_mps, n=1)$time.in.parliament
female_mps <-female_mps %>%
mutate(constituency=ifelse(lead(is.na(party)), # If the next row has a NA in the party column
paste0(constituency, lead(ifelse(is.na(constituency),
"", constituency))), # Take its constituency value and add it to this row's
constituency), # Otherwise, do nothing
time.in.parliament=ifelse(lead(is.na(party)), # Do the same for time.in.parliament
paste0(time.in.parliament, lead(ifelse(is.na(time.in.parliament), # Take value from row below and add it to the end of current row
"", paste0(";", time.in.parliament)))),
time.in.parliament)) %>% # Otherwise, do nothing
mutate(notes=ifelse(lead(is.na(party) & !is.na(name)), # Now check for instances where the notes have overflowed onto the next row
paste0(ifelse(is.na(notes), "", paste0(notes, ", ")), lead(name)), # Append notes onto the row
notes)) %>%
filter(!is.na(party)) # Filter out the unneeded rows
# Add the last constituency back in
female_mps$constituency[nrow(female_mps)] <- last_constituency
female_mps$time.in.parliament[nrow(female_mps)] <- last_time
# Make constituency and time.in.parliament into character type string
female_mps <- female_mps %>% mutate(constituency = as.character(constituency),
time.in.parliament = as.character(time.in.parliament))
female_mps %>% datatable()
# Separate different tenures as MPs into multiple rows
new_df <- data.frame()
female_mps$nth.term <- NA
for(i in 1:nrow(female_mps)) {
# Split time into multiple tenures using ; delimiter
time_split <- strsplit(as.character(female_mps[i,]$time.in.parliament), ";", fixed=T)[[1]]
time_split <- time_split[time_split != ""] # Remove empty strings
constituency_split <- strsplit(as.character(female_mps[i,]$constituency), ";", fixed=T)[[1]]
constituency_split <- constituency_split[constituency_split != ""]
# Loop over all times in parliament
if(length(time_split) > 1) {
# First correct the row in the original df
female_mps[i,]$time.in.parliament <- time_split[1]
# Set the first term number to 1
female_mps[i,]$nth.term <- 1
for(i_time in 2:length(time_split)) {
# Create new row with current row's values
new_row <- female_mps[i,]
new_row$time.in.parliament <- time_split[i_time] # Change new row's time.in.parliament to reflect the loop
new_row$nth.term <- i_time # Set the term number to the count so that we can track which term this is
if(length(time_split)==length(constituency_split)) { # If there are an equal number of constituency and time splits
# then we can assign them 1:1
# Correct the contituency value
female_mps[i,]$constituency <- constituency_split[1]
new_row$constituency <- trimws(strsplit(constituency_split[i_time], "(", fixed=T)[[1]][1]) # Change new row's constituency to reflect the loop
}
# Add row to df
new_df <- rbind(new_df, new_row)
}
} else if (length(constituency_split) > 1) {
# First correct the row in the original df
female_mps[i,]$constituency <- constituency_split[1]
female_mps[i,]$nth.term <- 1
# This is messy but if the MP does not have a term end, we tack on the start of the next term as the end of the previous one
if (tail(strsplit(time_split[1], "")[[1]], n=1) == "-") { # If first term range ends with a -, then there is no term end date
female_mps[i,]$time.in.parliament <- gsub("--", "-", paste0(time_split[1],
"-",
strsplit(gsub(")", "", strsplit(constituency_split[2],
"(",
fixed=T)[[1]][2],
fixed=T), "-", fixed=T)[1]), fixed=T)
} else {
female_mps[i,]$time.in.parliament <- gsub(")",
"",
strsplit(constituency_split[1],
"(", fixed=T)[[1]][2],
fixed=T)
}
# For each constituency that the MP has been elected in,
for(i_constituency in 2:length(constituency_split)) {
# Create new row with current row's values
new_row <- female_mps[i,]
new_row$nth.term <- i_constituency # Set the term number to the count so that we can track which term this is
# Change new row's constituency to reflect the loop
new_row$constituency <- trimws(strsplit(constituency_split[i_constituency], "(", fixed=T)[[1]][1])
# Do the same for the time.in.parliament by taking the time range from within brackets
new_row$time.in.parliament <- gsub(")", "", strsplit(constituency_split[i_constituency], "(", fixed=T)[[1]][2], fixed=T)
# Add row to df
new_df <- rbind(new_df, new_row)
}
} else { # For those MPS with only one term served
female_mps[i,]$nth.term <- 1
}
}
female_mps <- rbind(female_mps, new_df)
female_mps <- female_mps %>% mutate(time.in.parliament=sub(";", "", time.in.parliament)) # Remove superfluous ";"s
female_mps %>% filter(is.na(time.in.parliament))
female_mps <- female_mps %>% rowwise() %>%
mutate(term.start = trimws(gsub("*", "", strsplit(time.in.parliament, "-")[[1]][1], fixed=T)),
term.end=strsplit(time.in.parliament, "-")[[1]][2],
current=substring(time.in.parliament, nchar(time.in.parliament))=="-",
by.election=grepl("*", time.in.parliament, fixed=T)) # If * in time.in.parliament, then MP was elected at a by-election
# Clean up term.end: need to turn (35, Feb 60, 2005, 10, NA) into (1935, 02/1960, 2005, 2010, [term.start])
female_mps <-
female_mps %>% rowwise() %>% mutate(term.start = ifelse(nchar(term.start) == 4,
as.Date(paste0(term.start, "-01-01")),
as.Date(paste("01", term.start), format="%d %b %Y")),
term.end = ifelse(!grepl("-", time.in.parliament), # If no dash is present
term.start, # then MP ended term in the same year
ifelse(nchar(term.end) == 2, # Otherwise, if year is 2 char
ifelse(as.integer(term.end) > 17, # And if term.end > 17
as.Date(paste0("19", term.end, "-01-01")), # Then the year is in 1900s
as.Date(paste0("20", term.end, "-01-01"))), # Otherwise, it's in 2000s
ifelse(nchar(term.end) == 4, # If year is instead 4 char
as.Date(paste0(term.end, "-01-01")), # Just convert to date
as.Date(paste("01", term.end), format="%d %b %y"))))) %>%# If it has format Feb 60 -> 01 Feb 60 and convert to Date
mutate(term.start = as.Date(term.start),
term.end = as.Date(term.end))
female_mps <- female_mps %>% mutate(term.end=as.Date(term.end))
# Use different name for better matching
female_mps[female_mps$name=="Catherine SMITH",]$name <- "Cat SMITH"
female_mps[female_mps$name=="Caroline DINEAGE",]$name <- "Caroline DINENAGE"
female_mps[female_mps$name=="Miss Margaret JACKSON",]$name <- "Rt Hon Dame Margaret BECKETT"
# Cross reference with list of women MPs on data.parliament.uk
library(jsonlite)
library(httr)
female_mps_from_mnis <- fromJSON(content(GET(url="http://data.parliament.uk/membersdataplatform/services/mnis/members/query/House=Commons|Gender=F|Membership=all", add_headers(`content-type`="application/json")), type="text"))
female_mps_from_mnis <- female_mps_from_mnis$Members[[1]] %>%
select(DisplayAs, House, HouseStartDate, HouseEndDate) %>%
rowwise() %>%
mutate(HouseEndDate=as.Date(ifelse(typeof(HouseEndDate)=="character",
HouseEndDate,
NA),
format="%Y-%m-%d"))
honorifics <- 'Mr|Mrs|Ms|Miss|Advocate|Ambassador|Baron|Baroness|Brigadier|Canon|Captain|Chancellor|Chief|Col|Comdr|Commodore|Councillor|Count|Countess|Dame|Dr|Duke of|Earl|Earl of|Father|General|Group Captain|H R H the Duchess of|H R H the Duke of|H R H The Princess|HE Mr|HE Senora|HE The French Ambassador M|His Highness|His Hon|His Hon Judge|Hon|Hon Ambassador|Hon Dr|Hon Lady|Hon Mrs|HRH|HRH Sultan Shah|HRH The|HRH The Prince|HRH The Princess|HSH Princess|HSH The Prince|Judge|King|Lady|Lord|Lord and Lady|Lord Justice|Lt Cdr|Lt Col|Madam|Madame|Maj|Maj Gen|Major|Marchesa|Marchese|Marchioness|Marchioness of|Marquess|Marquess of|Marquis|Marquise|Master|Mr and Mrs|Mr and The Hon Mrs|President|Prince|Princess|Princessin|Prof|Prof Emeritus|Prof Dame|Professor|Queen|Rabbi|Representative|Rev Canon|Rev Dr|Rev Mgr|Rev Preb|Reverend|Reverend Father|Right Rev|Rt Hon|Rt Hon Baroness|Rt Hon Lord|Rt Hon Sir|Rt Hon The Earl|Rt Hon Viscount|Senator|Sir|Sister|Sultan|The Baroness|The Countess|The Countess of|The Dowager Marchioness of|The Duchess|The Duchess of|The Duke of|The Earl of|The Hon|The Hon Mr|The Hon Mrs|The Hon Ms|The Hon Sir|The Lady|The Lord|The Marchioness of|The Princess|The Reverend|The Rt Hon|The Rt Hon Lord|The Rt Hon Sir|The Rt Hon The Lord|The Rt Hon the Viscount|The Rt Hon Viscount|The Venerable|The Very Rev Dr|Very Reverend|Viscondessa|Viscount|Viscount and Viscountess|Viscountess|W Baron|W/Cdr'
honorifics <- paste0("(\\b", paste(strsplit(honorifics, "|", fixed=T)[[1]], collapse="\\b|\\b"), "\\b)")
unwanted_chars <- "'|’|-| |\\.|,"
female_mps_from_mnis <- female_mps_from_mnis %>%
mutate(clean.name = tolower(trimws(stri_trans_general(gsub(unwanted_chars, "", gsub(honorifics, "", DisplayAs)), "Latin-ASCII"))))
female_mps <- female_mps %>%
mutate(clean.name = tolower(trimws(stri_trans_general(gsub(unwanted_chars, "", gsub(honorifics, "", name)), "Latin-ASCII")))) %>%
stringdist_left_join(female_mps_from_mnis) %>%
mutate(term.end=as.Date(ifelse(!is.na(HouseEndDate) & current==T, HouseEndDate, term.end))) %>%
select(-DisplayAs, -House, -HouseStartDate, -HouseEndDate, -clean.name.y) %>%
filter(!is.na(time.in.parliament)) # Remove rows without a time in parliament: likely to be spurious at this point.
output <- female_mps %>%
mutate(party=ifelse(party=="Lab", "red",
ifelse(party=="Con", "blue",
ifelse(party %in% c("Lib Dem", "LD"), "orange",
ifelse(party=="SNP", "yellow",
ifelse(party=="Green", "green",
"black")))))) %>%
mutate(term.end=as.Date(ifelse(is.na(term.end),
as.Date("2020-01-01"),
term.end))) %>%
transmute(name, constituency, term_start=term.start, term_end=term.end, party, clean_name=clean.name.x) %>%
arrange(term_start)
streams <- c()
output$stream <- 0
for(i in 1:nrow(output)) {
row <- output[i,]
streams[i] <- 0
for(stream_id in 1:length(streams)) {
if(decimal_date(output[i,]$term_start) >= streams[stream_id]) {
streams[stream_id] <- decimal_date(output[i,]$term_end)
output[i,]$stream <- stream_id
break
}
}
}
output %>% write.csv("mps_over_time.csv", row.names=F)
p <- output %>% datatable()
saveWidget(p, "final_table.html")
display_html('<iframe src="final_table.html" width="100%" height="800">')